ANGOT_al_2025_Evolution

Author

Victor Angot

This analytical pipeline is supplementary to the article ‘Ecophylogenetic patterns of rhizosphere bacterial community assembly in Pisum spp. (Fabaceae, Fabeae) reveal strong ecological filtering’

The principal analyses outlined in the Materials and Methods section of the manuscript are presented here. Temporal phylogenetic dispersion analyses are not included, as they strictly follow the methodology developed by Darcy et al. (2020).

1 Packages

Code
library(agricolae)
library(ape)
library(broom)
library(car)
library(dplyr)
library(DT)
library(geiger)
library(ggplot2)
library(openxlsx)
library(outliers)
library(plotly)
library(rcompanion)
library(readxl)
library(SYNCSA)
library(tibble)
library(tidyr)
library(vegan)

2 Ecological analyzes

2.1 Datas

Code
mydf=read.csv(file = "data/mydf.csv",sep=";")
design=read.csv(file = "data/design.csv",sep=";")
tree=read.tree(file="data/MCC_10000posterior.tre")

2.2 Richness outliers

Likely due to technical issues that do not reflect true biological variation, samples with aberrant richness were removed

Code
tmydf<- t(mydf)

R<- colSums((mydf !=0))
C<- estimateR(tmydf)
S<- diversity(tmydf,index="simpson")
H<- diversity (tmydf, index="shannon")
E = H/log(R)
H<- data.frame(H)
S<- data.frame(S)
E<- data.frame(E)
C<- t(C)
SR<- 1/(1-S)
depth=colSums(mydf)

alpha<- cbind(C,S,SR,H,E,depth)

alpha$se.chao1 <- NULL 
alpha$se.ACE<- NULL 
names(alpha)<- c("S","C","A","Simpson","SimpsonRec","H","E","depth") 
head(alpha)
              S   C   A   Simpson SimpsonRec         H         E depth
Pa_1_S1_E     3   3 NaN 0.5825871   2.395710 0.9437891 0.8590739   393
Pa_1_S1_R     3   3 NaN 0.6091813   2.558731 1.0088935 0.9183344   295
Pa_1_S1_Rp   24  24  24 0.9307954  14.449908 2.8667875 0.9020576  3968
Pa_10_S1_E   64  64  64 0.9642152  27.944809 3.7203631 0.8945582  1435
Pa_10_S1_R    2   2 NaN 0.3053865   1.439650 0.4833972 0.6973948   335
Pa_10_S1_Rp 598 598 598 0.9958001 238.102534 5.9845165 0.9360181 11227
Code
all(rownames(design)==rownames(alpha)) 
[1] TRUE
Code
alpha <- cbind(alpha,design) 

list_of_dfs <- split(alpha, alpha$comp)
list_of_dfs <- lapply(list_of_dfs, function(df) {
  df[order(-df$S), ]
})

hist(list_of_dfs$Bulk$S,breaks=100)

Code
list_of_dfs$Bulk$S
 [1] 2435 2331 2258 1781 1601 1551 1529 1513 1408 1372 1354 1310 1240 1125 1024
[16]  889  859  810  805  595  564  488  136  107  102   30   28   17   13    7
[31]    7    6
Code
mean(list_of_dfs$Bulk$S)
[1] 915.4688
Code
sd(list_of_dfs$Bulk$S)
[1] 750.5213
Code
outlier(list_of_dfs$Bulk$S)
[1] 2435
Code
scores(list_of_dfs$Bulk$S,type = ("chisq"),prob= 1)
       Dim1
site1  2435
site2  2331
site3  2258
site4  1781
site5  1601
site6  1551
site7  1529
site8  1513
site9  1408
site10 1372
site11 1354
site12 1310
site13 1240
site14 1125
site15 1024
site16  889
site17  859
site18  810
site19  805
site20  595
site21  564
site22  488
site23  136
site24  107
site25  102
site26   30
site27   28
site28   17
site29   13
site30    7
site31    7
site32    6
Code
summary(list_of_dfs$Bulk$S)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
    6.0   105.8   874.0   915.5  1434.2  2435.0 
Code
outliers_Bulk <- rownames(list_of_dfs$Bulk[list_of_dfs$Bulk$S < 137, ])
outliers_Bulk
 [1] "Pe_NA_S2_Bulk_B"   "Ps_NA_S3_Bulk_A"   "Ps_NA_S1_Bulk_B"  
 [4] "Pf_NA_S1_Bulk_B_2" "Pa_NA_S3_Bulk_A"   "Pe_NA_S0_Bulk_A_2"
 [7] "Pa_NA_S3_Bulk_B_1" "Pa_NA_S1_Bulk_A_1" "Pa_NA_S0_Bulk_B"  
[10] "Pe_NA_S1_Bulk_A"  
Code
hist(list_of_dfs$R$S,breaks=100)

Code
list_of_dfs$R$S
 [1] 2461 2320 2195 2101 1954 1833 1598 1572 1558 1490 1459 1178 1143 1114 1089
[16] 1060 1049 1026 1019  879  830  786  745  676  674  594  557  539  511  508
[31]  497  451  394  380  368  329  296  292  287  266  265  232  221  220  181
[46]  136  126  121   65   60   46   29   22   22   19   18   14   14   13   13
[61]   12   12   11   10    8    5    5    4    3    3    2
Code
outliers_R <- rownames(list_of_dfs$R[list_of_dfs$R$S < 121, ])
outliers_R
 [1] "Pa_23_S2_R"   "Pf_26_S2_R"   "Pa_13_S2_R"   "Ps_10_S2_R"   "Pf_34_S1_R_2"
 [6] "Ps_8_S3_R"    "Pe_19_S1_R"   "Pf_21_S1_R"   "Pe_32_S1_R"   "Ps_30_S1_R"  
[11] "Pa_21_S3_R"   "Pf_24_S2_R"   "Pf_18_S1_R"   "Ps_27_S1_R"   "Pa_19_S1_R"  
[16] "Pe_25_S1_R"   "Pa_4_S1_R"    "Pa_27_S1_R"   "Ps_26_S2_R"   "Pa_18_S1_R"  
[21] "Pa_1_S1_R"    "Ps_33_S3_R"   "Pa_10_S1_R"  
Code
hist(list_of_dfs$Rp$S,breaks=100)

Code
list_of_dfs$Rp$S
 [1] 3370 2805 2014 1935 1821 1711 1673 1623 1529 1466 1431 1406 1358 1353 1253
[16] 1138 1068 1068 1007  990  934  905  858  852  761  735  734  732  725  697
[31]  651  619  610  605  598  580  580  522  518  513  505  478  434  429  420
[46]  391  382  376  352  312  306  262  209  207  196  174  153   94   66   46
[61]   38   33   25   24   20   20   19   19   19   19   16    6    4
Code
outliers_Rp <- rownames(list_of_dfs$Rp[list_of_dfs$Rp$S < 153, ])
outliers_Rp
 [1] "Pa_25_S2_Rp"  "Pf_21_S1_Rp"  "Pa_8_S3_Rp"   "Pa_21_S3_Rp"  "Pa_32_S2_Rp" 
 [6] "Ps_7_S3_Rp_1" "Pa_1_S1_Rp"   "Pf_32_S1_Rp"  "Ps_28_S1_Rp"  "Pa_24_S1_Rp" 
[11] "Pe_32_S1_Rp"  "Pe_36_S2_Rp"  "Pf_26_S2_Rp"  "Ps_16_S1_Rp"  "Ps_22_S3_Rp" 
[16] "Pf_13_S2_Rp" 
Code
hist(list_of_dfs$E$S,breaks=100)

Code
list_of_dfs$E$S
 [1] 2432 1095  922  585  580  566  544  516  359  348  270  229  212  212  204
[16]  189  172  143  133  126  125  123  117  107  106  106   99   97   94   94
[31]   94   91   86   86   77   73   72   67   66   64   61   60   59   58   52
[46]   51   51   51   45   37   33   29   29   22   21   17   17   15   15   15
[61]    7    7    6    6    6    5    5    4    4    4    3    3    3
Code
outliers_E <- rownames(list_of_dfs$E[list_of_dfs$E$S < 15, ])
outliers_E
 [1] "Pf_21_S1_E" "Pf_32_S1_E" "Pa_24_S1_E" "Pa_33_S1_E" "Pe_2_S3_E" 
 [6] "Ps_28_S1_E" "Ps_30_S1_E" "Pa_27_S1_E" "Pf_13_S2_E" "Ps_24_S1_E"
[11] "Pa_1_S1_E"  "Pa_32_S2_E" "Pf_26_S2_E"
Code
outliers_richness=c(outliers_Bulk,outliers_R,outliers_Rp,outliers_E)
outliers_richness
 [1] "Pe_NA_S2_Bulk_B"   "Ps_NA_S3_Bulk_A"   "Ps_NA_S1_Bulk_B"  
 [4] "Pf_NA_S1_Bulk_B_2" "Pa_NA_S3_Bulk_A"   "Pe_NA_S0_Bulk_A_2"
 [7] "Pa_NA_S3_Bulk_B_1" "Pa_NA_S1_Bulk_A_1" "Pa_NA_S0_Bulk_B"  
[10] "Pe_NA_S1_Bulk_A"   "Pa_23_S2_R"        "Pf_26_S2_R"       
[13] "Pa_13_S2_R"        "Ps_10_S2_R"        "Pf_34_S1_R_2"     
[16] "Ps_8_S3_R"         "Pe_19_S1_R"        "Pf_21_S1_R"       
[19] "Pe_32_S1_R"        "Ps_30_S1_R"        "Pa_21_S3_R"       
[22] "Pf_24_S2_R"        "Pf_18_S1_R"        "Ps_27_S1_R"       
[25] "Pa_19_S1_R"        "Pe_25_S1_R"        "Pa_4_S1_R"        
[28] "Pa_27_S1_R"        "Ps_26_S2_R"        "Pa_18_S1_R"       
[31] "Pa_1_S1_R"         "Ps_33_S3_R"        "Pa_10_S1_R"       
[34] "Pa_25_S2_Rp"       "Pf_21_S1_Rp"       "Pa_8_S3_Rp"       
[37] "Pa_21_S3_Rp"       "Pa_32_S2_Rp"       "Ps_7_S3_Rp_1"     
[40] "Pa_1_S1_Rp"        "Pf_32_S1_Rp"       "Ps_28_S1_Rp"      
[43] "Pa_24_S1_Rp"       "Pe_32_S1_Rp"       "Pe_36_S2_Rp"      
[46] "Pf_26_S2_Rp"       "Ps_16_S1_Rp"       "Ps_22_S3_Rp"      
[49] "Pf_13_S2_Rp"       "Pf_21_S1_E"        "Pf_32_S1_E"       
[52] "Pa_24_S1_E"        "Pa_33_S1_E"        "Pe_2_S3_E"        
[55] "Ps_28_S1_E"        "Ps_30_S1_E"        "Pa_27_S1_E"       
[58] "Pf_13_S2_E"        "Ps_24_S1_E"        "Pa_1_S1_E"        
[61] "Pa_32_S2_E"        "Pf_26_S2_E"       
Code
write.table(outliers_richness,file="outliers_richness_list.csv",sep=";")

mydf <- mydf[, !colnames(mydf) %in% outliers_richness]
design <- design[!rownames(design) %in% outliers_richness, ]
all(colnames(mydf)==rownames(design))
[1] TRUE

2.3 Mahalanobis outliers

Likely due to technical issues that do not reflect true biological variation, communities structure were removed

Code
all(rownames(design)==colnames(mydf))
mymat <- vegdist(t(log10(mydf+1)),method="bray")
mymat <- as.matrix(mymat)

set.seed(123)
nmds<-metaMDS(mymat, k=2, trymax = 100, autotransform = FALSE)

plot_data <- vegan::scores(nmds) %>% as_tibble(rownames = "samples")
plot_data$compartment <- design$comp  

calculate_mahalanobis <- function(data) {
  center <- colMeans(data[, c("NMDS1", "NMDS2")])
  cov_matrix <- cov(data[, c("NMDS1", "NMDS2")])
  mahal_dist <- mahalanobis(data[, c("NMDS1", "NMDS2")], center, cov_matrix)
  return(mahal_dist)
}

plot_data <- plot_data %>%
  group_by(compartment) %>%
  mutate(mahal_dist = calculate_mahalanobis(cur_data()))

threshold <- qchisq(0.95, df = 2)
plot_data$outlier <- ifelse(plot_data$mahal_dist > threshold, "Outlier", "Inlier")

nmds_plot_mahal <- ggplot(plot_data, aes(x = NMDS1, y = NMDS2, color = outlier)) +
  geom_point() +
  stat_ellipse(geom = "path", aes(group = compartment), level = 0.95) +
  geom_text(aes(label = samples), vjust = 1.5, hjust = 1.5, size = 1) +
  labs(title = "NMDS Plot with 95% Confidence Ellipses and Mahalanobis Outliers",
       x = "NMDS1", y = "NMDS2") +
  scale_color_manual(values = c("Inlier" = "black", "Outlier" = "red")) +
  theme_minimal()

print(nmds_plot_mahal)
Code
outliers_mahalanobis=(plot_data$samples[plot_data$outlier=="Outlier"])
outliers_mahalanobis
write.table(x=outliers_mahalanobis,file="outliers_NMDS_list.csv",sep=";")

Outliers are removed from the database

Code
mydf <- mydf[, !colnames(mydf) %in% outliers_mahalanobis]
design <- design[!rownames(design) %in% outliers_mahalanobis, ]
all(rownames(design)==colnames(mydf))
[1] TRUE
Code
alpha <- alpha[!rownames(alpha) %in% outliers_mahalanobis, ]

2.4 Subsampling on abundance

We removed ASVs whose sequencing depth was less than 0.01% of the total.

Code
counts_per_asv <- rowSums(mydf)
total_counts <- sum(counts_per_asv)
percent_counts_per_asv <- (counts_per_asv / total_counts) * 100
seuil_min_percent <- 0.01
seuil_min <- seuil_min_percent * total_counts / 100
mydf_hard_cleaned_0_01 <- mydf[counts_per_asv >= seuil_min, ]
all(rownames(design)==colnames(mydf_hard_cleaned_0_01))
[1] TRUE
Code
mydf=mydf_hard_cleaned_0_01

2.5 Sequencing efficacity

Rarefaction curves allow to ensure the completness of community coverage in terms of diversity.

Code
tmydf <- t(mydf);
x_range <- range(colSums(mydf))
y_range <- range(colSums(mydf!=0))
legend_x <- x_range[1] + 0.75 * diff(x_range)
legend_y <- y_range[2] - 0.2 * diff(y_range)

mycol_comp <- as.character(design$col_comp)

svg_filename = "plot/rarefaction_curve.svg"
svg(svg_filename)
rarecurve(tmydf,step=500,col=mycol_comp,label=FALSE,cex=0.5,ylab="Observed ASVs (n)",xlab="Recruited sequences (n)",main="Rarecurve by comp");
legend(legend_x, legend_y,legend =c("Bulk","R","Rp","E"),col=c("orange","green","blue","purple"),lty=1, cex=1)
dev.off()
png 
  2 

2.6 Alpha diversity

Calculate alpha diversity indexes

Code
tmydf<- t(mydf)

R<- colSums((mydf !=0))
C<- estimateR(tmydf)
S<- diversity(tmydf,index="simpson")
H<- diversity (tmydf, index="shannon")
E = H/log(R)
H<- data.frame(H)
S<- data.frame(S)
E<- data.frame(E)
C<- t(C)
SR<- 1/(1-S)
depth=colSums(mydf)

alpha<- cbind(C,S,SR,H,E,depth)

alpha$se.chao1 <- NULL 
alpha$se.ACE<- NULL 
names(alpha)<- c("S","C","A","Simpson","SimpsonRec","H","E","depth") 
head(alpha)
              S   C   A   Simpson SimpsonRec        H         E depth
Pa_10_S1_E   46  46  46 0.9553856   22.41431 3.446689 0.9002383  1265
Pa_10_S1_Rp 310 310 310 0.9923665  131.00185 5.334869 0.9299750  7821
Pa_12_S2_E   42  42  42 0.9683202   31.56587 3.572537 0.9558194   628
Pa_12_S2_R  317 317 317 0.9952162  209.03683 5.547288 0.9632545  5687
Pa_12_S2_Rp 414 414 414 0.9862078   72.50469 5.444482 0.9035187 10691
Pa_13_S2_E   42  42  42 0.9686563   31.90433 3.583906 0.9588611   806
Code
all(rownames(design)==rownames(alpha)) 
[1] TRUE
Code
alpha <- cbind(alpha,design) 

Plot alpha diversity

Code
length(alpha$comp[alpha$comp=="Bulk"])
[1] 20
Code
length(alpha$comp[alpha$comp=="R"])
[1] 45
Code
length(alpha$comp[alpha$comp=="Rp"])
[1] 52
Code
length(alpha$comp[alpha$comp=="E"])
[1] 56
Code
S<- alpha$S
Simpson<- alpha$Simpson
SimpsonRec<- alpha$SimpsonRec
H<- alpha$H
E<- alpha$E
alpha$col_comp <- factor(alpha$col_comp, levels = c("orange", "green", "blue", "purple"))
alpha$comp=factor(alpha$comp,levels=c("Bulk","R","Rp","E"))
col_comp=c("orange", "green", "blue", "purple")
labels=levels(alpha$comp)
variables <- c("S", "Simpson", "SimpsonRec", "H", "E")

srh=scheirerRayHare(S~comp+stage,data=alpha)

DV:  S 
Observations:  173 
D:  0.999978 
MS total:  2508.5 
Code
srh
            Df Sum Sq      H p.value
comp         3 230176 91.760 0.00000
stage        3  10659  4.249 0.23578
comp:stage   6   6660  2.655 0.85073
Residuals  160 168006               
Code
#Plot
for (variable in variables) {
  plot_name <- paste("plot_comp_", variable, sep = "")
  assign(plot_name, ggplot(alpha, aes(x = comp, y = .data[[variable]], fill = col_comp)) +
    geom_boxplot() +
    labs(title = paste(variable, "~ comp")) +
    scale_fill_manual(values = col_comp, name = "comp", labels = labels))
  
  ggsave(file = paste("plot/", variable, "~comp.svg", sep = ""), plot = get(plot_name))
  save(list = plot_name, file = paste("plot/", variable, "~comp.RData", sep = ""))
}

wb <- createWorkbook()
for (i in seq_along(variables)) {
  variable <- variables[i]
  kw_result <- agricolae::kruskal(get(variable), alpha$comp, group = TRUE, p.adj = "fdr")
  sheet_name <- variable
  addWorksheet(wb, sheetName = sheet_name)
  writeData(wb, sheet = sheet_name, x = kw_result$groups, startCol = 1, startRow = 1, rowNames = TRUE)
}
saveWorkbook(wb, file = "test_results/KW_alpha.xlsx", overwrite = TRUE)

kw_results <- list()

for (variable in variables) {
  variable_name <- substitute(variable)
  kw_result <- agricolae::kruskal(get(variable), alpha$comp, group = TRUE, p.adj = "fdr")
  colnames(kw_result$groups)[1] <- as.character(variable)
  kw_results[[variable]] <- kw_result
}

for (variable in variables) {
  print(kw_results[[variable]]$groups)
  print(kw_results[[variable]]$statistics)
}
             S groups
Bulk 129.97500      a
Rp   110.41346      b
R    107.66667      b
E     33.30357      c
     Chisq p.chisq
  98.11979       0
       Simpson groups
Bulk 118.25000      a
R    105.91111     ab
Rp    89.65385      b
E     58.17857      c
     Chisq      p.chisq
  32.89162 3.394745e-07
     SimpsonRec groups
Bulk  118.25000      a
R     105.91111     ab
Rp     89.65385      b
E      58.17857      c
     Chisq      p.chisq
  32.89162 3.394745e-07
             H groups
Bulk 134.30000      a
R    113.91111      b
Rp    98.21154      c
E     38.07143      d
     Chisq p.chisq
  86.87892       0
             E groups
Bulk 105.30000      a
R     98.93333      a
E     89.76786      a
Rp    66.65385      b
     Chisq     p.chisq
  13.97698 0.002936658

2.7 Beta diversity

Analyses of community structure

Code
all(colnames(mydf)==rownames(design))
[1] TRUE
Code
design$sample_name=rownames(design)
mydf=mydf[design$sample_name!="Ps_30_S1_Rp_2"]
design=subset(design,sample_name!="Ps_30_S1_Rp_2")

comp <-factor(design$comp,levels=c("Bulk","R","Rp","E"))
design$comp=factor(design$comp,levels=c("Bulk","R","Rp","E"))
design$specie[design$comp == "Bulk"] <- "none"

specie=design$specie
type <- design$type
stage=design$stage

mycap <- capscale(t(log10(mydf+1)) ~ comp*stage*specie,distance="bray");
mycap.aov <- anova(mycap,permutations = how(nperm=1000));
mycap.Rsq <- RsquareAdj(mycap);
mycap.aov;
Permutation test for capscale under reduced model
Permutation: free
Number of permutations: 1000

Model: capscale(formula = t(log10(mydf + 1)) ~ comp * stage * specie, distance = "bray")
          Df SumOfSqs      F   Pr(>F)    
Model     39   26.273 3.5978 0.000999 ***
Residual 133   24.904                    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Code
mycap.Rsq;
$r.squared
[1] 0.513383

$adj.r.squared
[1] 0.3706908
Code
R2 <- round(mycap.Rsq[["r.squared"]],2)
mycap.term<-anova(mycap,by="term",permutations = how(nperm=1000));
mycap.term$SumOfSqs
[1] 14.235212  2.147829  2.010236  2.041755  1.518961  1.941091  2.378452
[8] 24.903724
Code
total= sum(mycap.term$SumOfSqs)
expl_var <- (mycap.term$SumOfSqs / total) * 100
mycap.term$expl_variance=expl_var
mycap.term;
Permutation test for capscale under reduced model
Terms added sequentially (first to last)
Permutation: free
Number of permutations: 1000

Model: capscale(formula = t(log10(mydf + 1)) ~ comp * stage * specie, distance = "bray")
                   Df SumOfSqs       F   Pr(>F) expl_variance
comp                3  14.2352 25.3414 0.000999        27.816
stage               3   2.1478  3.8235 0.000999         4.197
specie              3   2.0102  3.5786 0.000999         3.928
comp:stage          6   2.0418  1.8174 0.001998         3.990
comp:specie         6   1.5190  1.3520 0.038961         2.968
stage:specie        6   1.9411  1.7278 0.001998         3.793
comp:stage:specie  12   2.3785  1.0585 0.295704         4.647
Residual          133  24.9037                         48.662
Code
mycap.eig<-eigenvals(mycap);
my_eig <- summary(mycap.eig);
prop_cap1 <- round(my_eig[2, 1],2)*100
prop_cap2 <- round(my_eig[2, 2],2)*100

mycol <- as.character(design$col_comp)
mypch <- as.numeric(design$pch_stage)
name <- colnames(mydf)
base::plot(mycap, choices = c(1, 2),type="n",
               xlab = paste("CAP1 (", prop_cap1, "%)"),
               ylab = paste("CAP2 (", prop_cap2, "%)"),
               main = paste("Bray-Curtis ~ Comp*Stage (R2=", R2,")"))
points(mycap, choices=c(1,2), display = "sites", cex = 1.2, col=mycol,pch=mypch);
ordiellipse(mycap, choices=c(1,2),comp, display = "sites", conf=0.95, kind = "sd", draw= "polygon")
legend_labels1<- unique(design$comp)
legend_labels2<-unique(design$stage)
pch_symbols <- as.numeric(design$pch_stage[match(legend_labels2, design$stage)])
col_legend <- as.character(design$col_comp[match(legend_labels1, design$comp)])
legend(1,-0.5, legend =legend_labels2,pch=pch_symbols)
legend(3.2,-0.5, legend =legend_labels1,fill=col_legend)

Code
dev.copy(svg, "plot/beta_div_comp_stage_bray_3factors.svg")
svg 
  3 
Code
dev.off()
png 
  2 

2.8 Taxonomy

Taxonomical analyses

Code
taxonomy <- read.csv(file = "data/taxonomy.csv", sep = ";", head = TRUE)
mydf_tax <- mydf

design$sample_id <- rownames(design)
mydf_tax$ASV <- rownames(mydf_tax)

mydf_tax <- mydf_tax %>%
  pivot_longer(-ASV, names_to = "sample_id", values_to = "count")

mydf_tax <- mydf_tax %>%
  left_join(taxonomy, by = "ASV") %>%
  left_join(design, by = "sample_id")

summed_counts <- mydf_tax %>%
  group_by(genus, sample_id) %>%
  summarize(summed_count = sum(count, na.rm = TRUE)) %>%
  ungroup()

relative_abundance_by_genus <- summed_counts %>%
  group_by(sample_id) %>%
  mutate(relative_abundance = summed_count / sum(summed_count, na.rm = TRUE)) %>%
  ungroup()


mean_abundance_by_genus <- relative_abundance_by_genus %>%
  group_by(genus) %>%
  summarize(mean_relative_abundance = mean(relative_abundance, na.rm = TRUE)) %>%
  arrange(desc(mean_relative_abundance))


top_11 <- mean_abundance_by_genus %>%
  top_n(11, wt = mean_relative_abundance) %>%
  pull(genus)

relative_abundance_by_genus <- relative_abundance_by_genus %>%
  mutate(genus = ifelse(genus %in% top_11, as.character(genus), "others"))
unique(relative_abundance_by_genus$genus)
 [1] "others"                                            
 [2] "Allorhizobium-Neorhizobium-Pararhizobium-Rhizobium"
 [3] "Bacillus"                                          
 [4] "Flavobacterium"                                    
 [5] "Hydrogenophaga"                                    
 [6] "Methylobacillus"                                   
 [7] "Methylotenera"                                     
 [8] "Noviherbaspirillum"                                
 [9] "Paeniglutamicibacter"                              
[10] "Pseudomonas"                                       
[11] "Rhizobacter"                                       
[12] "Rubrivivax"                                        
Code
relative_abundance_final <- relative_abundance_by_genus %>%
  group_by(sample_id, genus) %>%
  summarize(relative_abundance = sum(relative_abundance, na.rm = TRUE), .groups = 'drop')

data <- relative_abundance_final %>%
  left_join(design, by = "sample_id")
data$comp=factor(data$comp,levels=c("Bulk","R","Rp","E"))
data$stage=factor(data$stage,levels=c("S0","S1","S2","S3"))
data$comp_stage=factor(data$comp_stage,levels=c("Bulk_S0","Bulk_S1","Bulk_S2","Bulk_S3","R_S1","R_S2","R_S3","Rp_S1","Rp_S2","Rp_S3","E_S1","E_S2","E_S3"))

mean_abundance_by_genus_comp_stage <- data %>%
  group_by(genus, comp_stage) %>%
  summarize(mean_relative_abundance = mean(relative_abundance, na.rm = TRUE), .groups = 'drop')

mean_abundance_by_genus_comp_stage <- mean_abundance_by_genus_comp_stage %>%
  mutate(
    genus = case_when(
      grepl("Allorhizobium-Neorhizobium-Pararhizobium-Rhizobium", genus) ~ "Rhizobium",
      TRUE ~ genus))

mean_abundance_by_genus_comp_stage <- mean_abundance_by_genus_comp_stage %>%
  mutate(
    comp = case_when(
      grepl("Bulk", comp_stage) ~ "Bulk",
      grepl("Rp", comp_stage) ~ "Rp",
      grepl("R", comp_stage) ~ "R",
      grepl("E", comp_stage) ~ "E",
      TRUE ~ NA_character_  # Valeur par défaut si aucune correspondance
    ),
    stage = case_when(
      grepl("S0", comp_stage) ~ "S0",
      grepl("S1", comp_stage) ~ "S1",
      grepl("S2", comp_stage) ~ "S2",
      grepl("S3", comp_stage) ~ "S3",
      TRUE ~ NA_character_  # Valeur par défaut si aucune correspondance
    ))

mean_abundance_by_genus_comp_stage <- mean_abundance_by_genus_comp_stage %>%
  mutate(genus = factor(genus, levels = mean_abundance_by_genus_comp_stage %>%
                              group_by(genus) %>%
                              summarize(mean_abundance = mean(mean_relative_abundance)) %>%
                              arrange(mean_abundance) %>%
                              pull(genus)))
mean_abundance_by_genus_comp_stage$comp=factor(mean_abundance_by_genus_comp_stage$comp,levels=c("Bulk","R","Rp","E"))

plot_genus <- mean_abundance_by_genus_comp_stage %>%
  ggplot(aes(x = comp_stage, y = mean_relative_abundance)) +
  facet_grid(~comp, scales = "free_x", space = "free_x") +
  geom_bar(aes(fill = genus), stat = "identity", position = "fill", width = 1) +
  scale_fill_brewer(palette = "Paired") +
  scale_y_continuous(name = "Relative abundance", labels = scales::percent) +
  theme(axis.text.x = element_text(angle = 90, size = 5),
        axis.text.y = element_text(color = "black"),
        strip.text = element_text(face = "bold"),
        strip.background = element_blank())
ggplotly(plot_genus)
Code
plot_genus

3 Phylogenetic inference

3.1 Rao’s diversity

Phylogenetic diversity (Rao’s index)

Code
tmydf <- log10(t(mydf) + 1)
phylo_dist_matrix <- cophenetic(tree)
common_taxa <- intersect(colnames(tmydf), rownames(phylo_dist_matrix))
tmydf <- tmydf[, common_taxa]
phylo_dist_matrix <- phylo_dist_matrix[common_taxa, common_taxa]
my_rao_pd <- rao.diversity(
  tmydf,
  phylodist = phylo_dist_matrix,
  standardize = TRUE
)

results_rao <- data.frame(
  id = rownames(tmydf),
  rao = my_rao_pd$PhyRao
)

if (!"id" %in% colnames(design)) {
  design$id <- rownames(design)
}

rao_data <- left_join(design, results_rao, by = "id")

rownames(rao_data) <- rao_data$id

rao_data$comp <- factor(rao_data$comp, levels = c("Bulk", "R", "Rp", "E"))

rao_plot_comp=ggplot(rao_data, aes(x = comp, y = rao, fill = comp)) +
  geom_boxplot() +
  scale_fill_manual(values = c("orange", "green", "blue", "purple")) +
  labs(x = "Plant Compartments", y = "Rao's Index") +
  ggtitle("Rao's index by compartments")+
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))